import os
import pandas as pd
import re 
from treelib import Node, Tree
from operator import itemgetter

# Creates the file system map
def file_scan(root_dir):
    """Recursively map the directory tree rooted at ``root_dir``.

    Args:
        root_dir: Path of the directory to scan.

    Returns:
        A dict that always contains:
          * ``'total_files'``: number of files directly inside ``root_dir``
          * ``'total_directories'``: number of direct sub-directories
          * ``'files'``: list of ``{'name': ..., 'size': ...}`` dicts, where
            ``size`` is ``os.stat(...).st_size`` (bytes)
        plus one key per sub-directory name whose value is the same kind of
        dict for that sub-directory. Entries named ``.git`` are skipped.
    """
    file_list = []
    sub_dirs = {}

    for element in os.listdir(root_dir):
        if element == '.git':
            continue
        next_path = os.path.join(root_dir, element)
        if os.path.isdir(next_path):
            sub_dirs[element] = file_scan(next_path)
        else:
            file_list.append({'name': element, 'size': os.stat(next_path).st_size})

    # Build the metadata once, after the loop, so that an empty directory
    # still reports zero counts instead of coming back as a bare {}.
    dir_dict = {
        'total_files': len(file_list),
        'total_directories': len(sub_dirs),
        'files': file_list,
    }
    # setdefault keeps the metadata entry if a sub-directory happens to be
    # called 'files', 'total_files' or 'total_directories'.
    for name, sub_map in sub_dirs.items():
        dir_dict.setdefault(name, sub_map)
    return dir_dict


def create_tree(dir_dict):
    """Build a treelib ``Tree`` from a nested directory map.

    A root node tagged ``"*"`` with identifier ``"root"`` is created first.
    Every dict value in ``dir_dict`` becomes a directory node and every item
    of a list value becomes a file node; that walk is done by
    ``create_node`` starting at the root.

    Args:
        dir_dict: Nested dict whose dict values are sub-directories and whose
            list values hold ``{'name': ...}`` file entries.

    Returns:
        The populated ``Tree``.
    """
    root_id = "root"
    tree = Tree()
    tree.create_node("*", root_id)
    # The top level is handled exactly like any other level, with the root
    # node as the parent.
    create_node(dir_dict, root_id, tree)
    return tree

# Function complementary to create_tree()
def create_node(child_dict, level, tree):
    """Recursively add the entries of ``child_dict`` to ``tree`` under ``level``.

    Dict values become directory nodes (and are walked recursively); each
    item of a list value becomes a file node. Other values are ignored.

    Node identifiers are built as ``"<parent identifier>/<name>"`` while the
    displayed tag stays the bare name. treelib requires identifiers to be
    unique within a tree, so using the bare name as identifier fails as soon
    as two directories contain an entry with the same name.

    Args:
        child_dict: Nested directory map for the current level.
        level: Identifier of the already-existing parent node.
        tree: Tree object exposing ``create_node(tag, identifier, parent=...)``.

    Returns:
        None. ``tree`` is modified in place.
    """
    for element, value in child_dict.items():
        if isinstance(value, dict):
            node_id = '{}/{}'.format(level, element)
            tree.create_node(element, node_id, parent=level)
            create_node(value, node_id, tree)
        elif isinstance(value, list):
            for list_element in value:
                name = list_element['name']
                tree.create_node(name, '{}/{}'.format(level, name), parent=level)
    return


def creates_summary(dir_dict):
    """Summarise a directory map by file extension.

    Files listed directly in ``dir_dict`` are tallied here; nested
    directories (dict values) are tallied by ``append_summary``.

    The extension is taken with ``os.path.splitext`` (text after the last
    dot). Names without an extension, including dotfiles such as
    ``.gitignore``, are grouped under ``'(no extension)'`` instead of
    raising.

    Args:
        dir_dict: Nested dict whose list values hold
            ``{'name': ..., 'size': ...}`` file entries and whose dict values
            are sub-directories.

    Returns:
        A tuple ``(table_html, pie_chart_data)``:
          * ``table_html``: HTML table with one row per extension and the
            columns ``count``, ``size_KB`` and ``size_%``, sorted by
            ``size_%`` descending.
          * ``pie_chart_data``: ``[['File type', 'Size'], [ext, size], ...]``
            with the data rows sorted by size (``size / 1000``), descending.
    """
    summary_count = {}
    summary_size = {}
    for value in dir_dict.values():
        if isinstance(value, dict):
            append_summary(value, summary_count, summary_size)
        elif isinstance(value, list):
            for list_element in value:
                # splitext returns '' when there is no extension; [1:] drops the dot.
                file_ext = os.path.splitext(list_element['name'])[1][1:] or '(no extension)'
                summary_count[file_ext] = summary_count.get(file_ext, 0) + 1
                summary_size[file_ext] = summary_size.get(file_ext, 0) + (list_element['size'] / 1000)  # Represents KB

    pie_chart_data = sorted(
        ([ext, size] for ext, size in summary_size.items()),
        key=itemgetter(1),
        reverse=True,
    )
    pie_chart_data.insert(0, ['File type', 'Size'])

    extensions = list(summary_count)
    # Build the frame in one step; naming the columns up front keeps an empty
    # scan from failing on a missing 'size_KB' column.
    summary_df = pd.DataFrame(
        {
            'count': [summary_count[ext] for ext in extensions],
            'size_KB': [round(summary_size[ext], 2) for ext in extensions],
        },
        index=extensions,
        dtype=float,
    )

    total_kb = summary_df['size_KB'].sum()
    if total_kb:
        summary_df['size_%'] = (summary_df['size_KB'] / total_kb * 100).round(2)
    else:
        # No files, or only zero-byte files: avoid 0/0 producing NaN.
        summary_df['size_%'] = 0.0
    summary_df = summary_df.sort_values(by=['size_%'], ascending=False)
    return summary_df.to_html(justify='center'), pie_chart_data

# Function complementary to creates_summary()
def append_summary(child_dict, summary_count, summary_size):
    """Recursively accumulate per-extension file counts and sizes.

    Dict values of ``child_dict`` are walked recursively; each item of a
    list value is treated as a ``{'name': ..., 'size': ...}`` file entry.

    The extension is taken with ``os.path.splitext`` (text after the last
    dot). Names without an extension, including dotfiles such as
    ``.gitignore``, are grouped under ``'(no extension)'`` instead of
    raising.

    Args:
        child_dict: Nested directory map for the current level.
        summary_count: Dict mapping extension -> number of files; updated
            in place.
        summary_size: Dict mapping extension -> accumulated ``size / 1000``;
            updated in place.

    Returns:
        None.
    """
    for value in child_dict.values():
        if isinstance(value, dict):
            append_summary(value, summary_count, summary_size)
        elif isinstance(value, list):
            for list_element in value:
                # splitext returns '' when there is no extension; [1:] drops the dot.
                file_ext = os.path.splitext(list_element['name'])[1][1:] or '(no extension)'
                summary_count[file_ext] = summary_count.get(file_ext, 0) + 1
                summary_size[file_ext] = summary_size.get(file_ext, 0) + (list_element['size'] / 1000)  # Represents KB
    return


#1. Gets the current working directory (assumed to be the notebook's folder)
dirname = os.getcwd()

#2. Creates the main dict that every later step consumes.
# The scan starts two levels above the working directory; the path is built
# from os.pardir components rather than a literal with embedded separators.
main_dict = file_scan(os.path.join(dirname, os.pardir, os.pardir))
main_dict

{'total_files': 4,
 'total_directories': 7,
 'files': [{'name': 'index.html', 'size': 9907},
  {'name': 'jupyter.html', 'size': 4597},
  {'name': 'README.md', 'size': 44},
  {'name': 'rmarkdown.html', 'size': 8420}],
 'certificates': {'total_files': 8,
  'total_directories': 0,
  'files': [{'name': 'Data_Analyst_Carso.pdf', 'size': 30610},
   {'name': 'Data_Mining_Carso.pdf', 'size': 30501},
   {'name': 'Data_Science_Fundamentals_with_Python_and_SQL_Program_Coursera.pdf',
    'size': 326958},
   {'name': 'Google_Data_Analytics_Program_Coursera.pdf', 'size': 343510},
   {'name': 'Google_IT_Automation_with_Python_Program_Coursera.pdf',
    'size': 344731},
   {'name': 'Probabilidad_y_Estadistica_Coursera.pdf', 'size': 401904},
   {'name': 'Python_for_Everyone_Coursera.pdf', 'size': 330069},
   {'name': 'Web_Design_for_Everyone_Coursera.pdf', 'size': 348725}]},
 'css': {'total_files': 1,
  'total_directories': 0,
  'files': [{'name': 'style.css', 'size': 4871}]},
 'cv': {'total_files': 2,
  'total_directories': 0,
  'files': [{'name': 'CV_Cesar_Perez_Eng.pdf', 'size': 139929},
   {'name': 'CV_Cesar_Perez_Esp.pdf', 'size': 140741}]},
 'img': {'total_files': 7,
  'total_directories': 1,
  'files': [{'name': 'Montreal.jpg', 'size': 239713},
   {'name': 'MyPic.jpg', 'size': 575484},
   {'name': 'real_monte1.jpg', 'size': 153924},
   {'name': 'real_monte2.jpg', 'size': 1001247},
   {'name': 'real_monte4.jpeg', 'size': 186387},
   {'name': 'Seoul.jpg', 'size': 578631},
   {'name': 'view_home1.jpg', 'size': 925442}],
  'thumbnail': {'total_files': 8,
   'total_directories': 0,
   'files': [{'name': 'Data_Analyst_Carso_thumbnail.png', 'size': 94840},
    {'name': 'Data_Mining_Carso_thumbnail.png', 'size': 76513},
    {'name': 'Data_Science_Fundamentals_with_Python_and_SQL_Program_Coursera_thumbnail.png',
     'size': 353725},
    {'name': 'Google_Data_Analytics_Program_Coursera_thumbnail.png',
     'size': 347224},
    {'name': 'Google_IT_Automation_with_Python_Program_Coursera_thumbnail.png',
     'size': 368312},
    {'name': 'Probabilidad_y_Estadistica_Coursera_thumbnail.png',
     'size': 310200},
    {'name': 'Python_for_Everyone_Coursera_thumbnail.png', 'size': 467097},
    {'name': 'Web_Design_for_Everyone_Coursera_thumbnail.png',
     'size': 491931}]}},
 'js': {'total_files': 1,
  'total_directories': 0,
  'files': [{'name': 'lib.js', 'size': 3698}]},
 'jupyter': {'FileScan': {'total_files': 4,
   'total_directories': 0,
   'files': [{'name': 'FileSystem_Scan.html', 'size': 639501},
    {'name': 'FileSystem_Scan.ipynb', 'size': 11680},
    {'name': 'report.html', 'size': 7588},
    {'name': 'template.html', 'size': 3097}]},
  'total_files': 0,
  'total_directories': 2,
  'files': [],
  'SqlStressTesting': {'total_files': 6,
   'total_directories': 0,
   'files': [{'name': 'report_MSSQL.csv', 'size': 405444},
    {'name': 'report_SQLite.csv', 'size': 453841},
    {'name': 'SQL_stress_testing.html', 'size': 787137},
    {'name': 'SQL_stress_testing.ipynb', 'size': 184085},
    {'name': 'testMSSQL.py', 'size': 2125},
    {'name': 'testSQLite.py', 'size': 1655}]}},
 'rmarkdown': {'total_files': 4,
  'total_directories': 0,
  'files': [{'name': 'Bicycle-Trip-Analysis-R.html', 'size': 1388017},
   {'name': 'Dating-app-review-analysis-R.html', 'size': 1493439},
   {'name': 'Fitness-Device-Usage-Analysis-R.html', 'size': 1597277},
   {'name': 'hotel-booking-analysis-R.html', 'size': 1853495}]}}


#3. File tree
# The tree is rendered through a temporary text file: treelib writes it,
# it is read back line by line, and the file is then deleted.
tree_filename = os.path.join(dirname, 'tree.txt')
file_tree = create_tree(main_dict)

file_tree.save2file(tree_filename)
try:
    # 'with' guarantees the handle is closed even if reading fails.
    with open(tree_filename, 'r', encoding='utf-8') as tree_txt_reader:
        # Wrap every line in its own <pre> element so the report keeps the
        # tree's indentation.
        tree_txt = "".join('<pre>' + line + '</pre>' for line in tree_txt_reader)
finally:
    # Always remove the temporary file, including on error.
    os.remove(tree_filename)
tree_txt

'<pre>*\n</pre><pre>├── README.md\n</pre><pre>├── certificates\n</pre><pre>│   ├── Data_Analyst_Carso.pdf\n</pre><pre>│   ├── Data_Mining_Carso.pdf\n</pre><pre>│   ├── Data_Science_Fundamentals_with_Python_and_SQL_Program_Coursera.pdf\n</pre><pre>│   ├── Google_Data_Analytics_Program_Coursera.pdf\n</pre><pre>│   ├── Google_IT_Automation_with_Python_Program_Coursera.pdf\n</pre><pre>│   ├── Probabilidad_y_Estadistica_Coursera.pdf\n</pre><pre>│   ├── Python_for_Everyone_Coursera.pdf\n</pre><pre>│   └── Web_Design_for_Everyone_Coursera.pdf\n</pre><pre>├── css\n</pre><pre>│   └── style.css\n</pre><pre>├── cv\n</pre><pre>│   ├── CV_Cesar_Perez_Eng.pdf\n</pre><pre>│   └── CV_Cesar_Perez_Esp.pdf\n</pre><pre>├── img\n</pre><pre>│   ├── Montreal.jpg\n</pre><pre>│   ├── MyPic.jpg\n</pre><pre>│   ├── Seoul.jpg\n</pre><pre>│   ├── real_monte1.jpg\n</pre><pre>│   ├── real_monte2.jpg\n</pre><pre>│   ├── real_monte4.jpeg\n</pre><pre>│   ├── thumbnail\n</pre><pre>│   │   ├── Data_Analyst_Carso_thumbnail.png\n</pre><pre>│   │   ├── Data_Mining_Carso_thumbnail.png\n</pre><pre>│   │   ├── Data_Science_Fundamentals_with_Python_and_SQL_Program_Coursera_thumbnail.png\n</pre><pre>│   │   ├── Google_Data_Analytics_Program_Coursera_thumbnail.png\n</pre><pre>│   │   ├── Google_IT_Automation_with_Python_Program_Coursera_thumbnail.png\n</pre><pre>│   │   ├── Probabilidad_y_Estadistica_Coursera_thumbnail.png\n</pre><pre>│   │   ├── Python_for_Everyone_Coursera_thumbnail.png\n</pre><pre>│   │   └── Web_Design_for_Everyone_Coursera_thumbnail.png\n</pre><pre>│   └── view_home1.jpg\n</pre><pre>├── index.html\n</pre><pre>├── js\n</pre><pre>│   └── lib.js\n</pre><pre>├── jupyter\n</pre><pre>│   ├── FileScan\n</pre><pre>│   │   ├── FileSystem_Scan.html\n</pre><pre>│   │   ├── FileSystem_Scan.ipynb\n</pre><pre>│   │   ├── report.html\n</pre><pre>│   │   └── template.html\n</pre><pre>│   └── SqlStressTesting\n</pre><pre>│       ├── SQL_stress_testing.html\n</pre><pre>│       ├── 
SQL_stress_testing.ipynb\n</pre><pre>│       ├── report_MSSQL.csv\n</pre><pre>│       ├── report_SQLite.csv\n</pre><pre>│       ├── testMSSQL.py\n</pre><pre>│       └── testSQLite.py\n</pre><pre>├── jupyter.html\n</pre><pre>├── rmarkdown\n</pre><pre>│   ├── Bicycle-Trip-Analysis-R.html\n</pre><pre>│   ├── Dating-app-review-analysis-R.html\n</pre><pre>│   ├── Fitness-Device-Usage-Analysis-R.html\n</pre><pre>│   └── hotel-booking-analysis-R.html\n</pre><pre>└── rmarkdown.html\n</pre>'


#4. Pie Chart and table
# creates_summary returns the summary table as an HTML string and the chart
# rows (a header row followed by [extension, size] pairs, largest first).
table_summary, pie_chart = creates_summary(main_dict)
table_summary

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: center;">\n      <th></th>\n      <th>count</th>\n      <th>size_KB</th>\n      <th>size_%</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>html</th>\n      <td>11.0</td>\n      <td>7792.48</td>\n      <td>44.61</td>\n    </tr>\n    <tr>\n      <th>jpg</th>\n      <td>6.0</td>\n      <td>3474.44</td>\n      <td>19.89</td>\n    </tr>\n    <tr>\n      <th>png</th>\n      <td>8.0</td>\n      <td>2509.84</td>\n      <td>14.37</td>\n    </tr>\n    <tr>\n      <th>pdf</th>\n      <td>10.0</td>\n      <td>2437.68</td>\n      <td>13.95</td>\n    </tr>\n    <tr>\n      <th>csv</th>\n      <td>2.0</td>\n      <td>859.29</td>\n      <td>4.92</td>\n    </tr>\n    <tr>\n      <th>ipynb</th>\n      <td>2.0</td>\n      <td>195.77</td>\n      <td>1.12</td>\n    </tr>\n    <tr>\n      <th>jpeg</th>\n      <td>1.0</td>\n      <td>186.39</td>\n      <td>1.07</td>\n    </tr>\n    <tr>\n      <th>css</th>\n      <td>1.0</td>\n      <td>4.87</td>\n      <td>0.03</td>\n    </tr>\n    <tr>\n      <th>js</th>\n      <td>1.0</td>\n      <td>3.70</td>\n      <td>0.02</td>\n    </tr>\n    <tr>\n      <th>py</th>\n      <td>2.0</td>\n      <td>3.78</td>\n      <td>0.02</td>\n    </tr>\n    <tr>\n      <th>md</th>\n      <td>1.0</td>\n      <td>0.04</td>\n      <td>0.00</td>\n    </tr>\n  </tbody>\n</table>'


# Display the chart rows; str(pie_chart) is later substituted into the report template.
pie_chart

[['File type', 'Size'],
 ['html', 7792.475],
 ['jpg', 3474.441],
 ['png', 2509.842],
 ['pdf', 2437.678],
 ['csv', 859.2850000000001],
 ['ipynb', 195.76500000000001],
 ['jpeg', 186.387],
 ['css', 4.871],
 ['py', 3.7800000000000002],
 ['js', 3.698],
 ['md', 0.044]]


#5. Create report
template_filename = os.path.join(dirname, 'template.html')

# Read with the same encoding the report is written in.
# NOTE(review): assumes template.html is stored as UTF-8 - confirm.
with open(template_filename, 'r', encoding='utf-8') as template_reader:
    template_content = template_reader.read()

# Plain str.replace is used on purpose: the placeholders are literal text,
# and re.sub would interpret backslashes / group references that may appear
# in the inserted content (it is derived from file names).
new_content = template_content.replace("pie_chart_data_goes_here", str(pie_chart))
new_content = new_content.replace("table_goes_here", str(table_summary))
new_content = new_content.replace("file_tree_goes_here", tree_txt)

report_filename = os.path.join(dirname, 'report.html')
# 'with' closes (and therefore flushes) the report file.
with open(report_filename, 'w', encoding="utf-8") as template_writer:
    template_writer.write(new_content)

6909

Using Recursive Functions to Scan the File System

Introduction

Step 1. Create functions

file_scan

create_tree and create_node

creates_summary and append_summary

Step 2. Generate Data

Step 3. Create HTML report

Here is the final report!